
char clCode_clppScan_Default[]=
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#define T int\n"
"__kernel \n"
"void kernel__ExclusivePrefixScanSmall(\n"
"	__global T* input,\n"
"	__global T* output,\n"
"	__local  T* block,\n"
"	const uint length)\n"
"{\n"
"	int tid = get_local_id(0);\n"
"	\n"
"	int offset = 1;\n"
"	block[2*tid]     = input[2*tid];\n"
"	block[2*tid + 1] = input[2*tid + 1];	\n"
"	for(int d = length>>1; d > 0; d >>=1)\n"
"	{\n"
"		barrier(CLK_LOCAL_MEM_FENCE);\n"
"		\n"
"		if(tid<d)\n"
"		{\n"
"			int ai = offset*(2*tid + 1) - 1;\n"
"			int bi = offset*(2*tid + 2) - 1;\n"
"			\n"
"			block[bi] += block[ai];\n"
"		}\n"
"		offset *= 2;\n"
"	}\n"
"	if(tid == 0)\n"
"		block[length - 1] = 0;\n"
"	for(int d = 1; d < length ; d *= 2)\n"
"	{\n"
"		offset >>=1;\n"
"		barrier(CLK_LOCAL_MEM_FENCE);\n"
"		\n"
"		if(tid < d)\n"
"		{\n"
"			int ai = offset*(2*tid + 1) - 1;\n"
"			int bi = offset*(2*tid + 2) - 1;\n"
"			\n"
"			float t = block[ai];\n"
"			block[ai] = block[bi];\n"
"			block[bi] += t;\n"
"		}\n"
"	}\n"
"	\n"
"	barrier(CLK_LOCAL_MEM_FENCE);\n"
"	output[2*tid]     = block[2*tid];\n"
"	output[2*tid + 1] = block[2*tid + 1];\n"
"}\n"
"#define NUM_BANKS 16\n"
"#define LOG_NUM_BANKS 4\n"
"#ifdef ZERO_BANK_CONFLICTS\n"
"#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS + (index) >> (2*LOG_NUM_BANKS))\n"
"#else\n"
"#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)\n"
"#endif\n"
"__kernel\n"
"void kernel__ExclusivePrefixScan(\n"
"	__global T* dataSet,\n"
"	\n"
"	__local T* localBuffer,\n"
"	\n"
"	__global T* blockSums,\n"
"	const uint blockSumsSize\n"
"	)\n"
"{\n"
"	const uint gid = get_global_id(0);\n"
"	const uint tid = get_local_id(0);\n"
"	const uint bid = get_group_id(0);\n"
"	const uint lwz  = get_local_size(0);\n"
"	\n"
"	// The local buffer has 2x the size of the local-work-size, because we manage 2 scans at a time.\n"
"const uint localBufferSize = lwz << 1;\n"
"int offset = 1;\n"
"	\n"
"const int tid2_0 = tid << 1;\n"
"const int tid2_1 = tid2_0 + 1;\n"
"	\n"
"	const int gid2_0 = gid << 1;\n"
"const int gid2_1 = gid2_0 + 1;\n"
"	// Cache the datas in local memory\n"
"#ifdef SUPPORT_AVOID_BANK_CONFLICT\n"
"	uint ai = tid;\n"
"	uint bi = tid + lwz;\n"
"	uint gai = gid;\n"
"	uint gbi = gid + lwz;\n"
"	uint bankOffsetA = CONFLICT_FREE_OFFSET(ai); \n"
"	uint bankOffsetB = CONFLICT_FREE_OFFSET(bi);\n"
"	localBuffer[ai + bankOffsetA] = (gai < blockSumsSize) ? dataSet[gai] : 0; \n"
"	localBuffer[bi + bankOffsetB] = (gbi < blockSumsSize) ? dataSet[gbi] : 0;\n"
"#else\n"
"	localBuffer[tid2_0] = (gid2_0 < blockSumsSize) ? dataSet[gid2_0] : 0;\n"
"	localBuffer[tid2_1] = (gid2_1 < blockSumsSize) ? dataSet[gid2_1] : 0;\n"
"#endif\n"
"	\n"
"for(uint d = lwz; d > 0; d >>= 1)\n"
"	{\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"		\n"
"if (tid < d)\n"
"		{\n"
"#ifdef SUPPORT_AVOID_BANK_CONFLICT\n"
"			//uint ai = mad24(offset, (tid2_1+0), -1);	// offset*(tid2_0+1)-1 = offset*(tid2_1+0)-1\n"
"			uint i = 2 * offset * tid;\n"
"			uint ai = i + offset - 1;\n"
"			uint bi = ai + offset;\n"
"			ai += CONFLICT_FREE_OFFSET(ai);	// ai += ai / NUM_BANKS;\n"
"			bi += CONFLICT_FREE_OFFSET(bi);	// bi += bi / NUM_BANKS;\n"
"#else\n"
"const uint ai = mad24(offset, (tid2_1+0), -1);	// offset*(tid2_0+1)-1 = offset*(tid2_1+0)-1\n"
"const uint bi = mad24(offset, (tid2_1+1), -1);	// offset*(tid2_1+1)-1;\n"
"#endif\n"
"localBuffer[bi] += localBuffer[ai];\n"
"}\n"
"offset <<= 1;\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"	\n"
"	/*\n"
"	if (tid < 1)\n"
"		blockSums[bid] = localBuffer[localBufferSize-1];\n"
"		\n"
"	barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"
"	\n"
"	if (tid < 1)\n"
"		localBuffer[localBufferSize - 1] = 0;\n"
"	*/\n"
"	\n"
"if (tid < 1)\n"
"	{\n"
"#ifdef SUPPORT_AVOID_BANK_CONFLICT\n"
"		uint index = localBufferSize-1;\n"
"		index += CONFLICT_FREE_OFFSET(index);\n"
"		blockSums[bid] = localBuffer[index];\n"
"		localBuffer[index] = 0;\n"
"#else\n"
"		// We store the biggest value (the last) to the sum-block for later use.\n"
"blockSums[bid] = localBuffer[localBufferSize-1];		\n"
"		//barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);		\n"
"		// Clear the last element\n"
"localBuffer[localBufferSize - 1] = 0;\n"
"#endif\n"
"}\n"
"for(uint d = 1; d < localBufferSize; d <<= 1)\n"
"	{\n"
"offset >>= 1;\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"		\n"
"if (tid < d)\n"
"		{\n"
"#ifdef SUPPORT_AVOID_BANK_CONFLICT\n"
"			//uint ai = mad24(offset, (tid2_1+0), -1);	// offset*(tid2_0+1)-1 = offset*(tid2_1+0)-1\n"
"			uint i = 2 * offset * tid;\n"
"			uint ai = i + offset - 1;\n"
"			uint bi = ai + offset;\n"
"			ai += CONFLICT_FREE_OFFSET(ai);	// Apply an offset to the __local memory\n"
"			bi += CONFLICT_FREE_OFFSET(bi);\n"
"#else\n"
"const uint ai = mad24(offset, (tid2_1+0), -1); // offset*(tid2_0+1)-1 = offset*(tid2_1+0)-1\n"
"const uint bi = mad24(offset, (tid2_1+1), -1); // offset*(tid2_1+1)-1;\n"
"#endif\n"
"T tmp = localBuffer[ai];\n"
"localBuffer[ai] = localBuffer[bi];\n"
"localBuffer[bi] += tmp;\n"
"}\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"	\n"
"#ifdef SUPPORT_AVOID_BANK_CONFLICT\n"
"	dataSet[gai] = (gai < blockSumsSize) * localBuffer[ai + bankOffsetA];		\n"
"	dataSet[gbi] = (gbi < blockSumsSize) * localBuffer[bi + bankOffsetB];		\n"
"#else\n"
"	if (gid2_0 < blockSumsSize)\n"
"		dataSet[gid2_0] = localBuffer[tid2_0];\n"
"	if (gid2_1 < blockSumsSize)\n"
"		dataSet[gid2_1] = localBuffer[tid2_1];\n"
"#endif\n"
"}\n"
"__kernel\n"
"void kernel__UniformAdd(\n"
"	__global T* output,\n"
"	__global const T* blockSums,\n"
"	const uint outputSize\n"
"	)\n"
"{\n"
"uint gid = get_global_id(0) * 2;\n"
"const uint tid = get_local_id(0);\n"
"const uint blockId = get_group_id(0);\n"
"	\n"
"	// Intel SDK fix\n"
"	//output[gid] += blockSums[blockId];\n"
"	//output[gid+1] += blockSums[blockId];\n"
"__local T localBuffer[1];\n"
"#ifdef SUPPORT_AVOID_BANK_CONFLICT\n"
"	uint blockOffset = 1024 - 1;\n"
"if (tid < 1)\n"
"localBuffer[0] = blockSums[blockId + blockOffset];\n"
"#else\n"
"if (tid < 1)\n"
"localBuffer[0] = blockSums[blockId];\n"
"#endif\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"	\n"
"#ifdef SUPPORT_AVOID_BANK_CONFLICT\n"
"	unsigned int address = blockId * get_local_size(0) * 2 + get_local_id(0); \n"
"	\n"
"	output[address] += localBuffer[0];\n"
"output[address + get_local_size(0)] += (get_local_id(0) + get_local_size(0) < outputSize) * localBuffer[0];\n"
"#else\n"
"	if (gid < outputSize)\n"
"		output[gid] += localBuffer[0];\n"
"	gid++;\n"
"	if (gid < outputSize)\n"
"		output[gid] += localBuffer[0];\n"
"#endif\n"
"}\n"
;
